Effect of UPSTM-Based
Decorrelation on Feature Discovery
Loading the
libraries
library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
library(TH.data)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
Material and
Methods
All variables are derived from a laser scanning image of the eye
background taken by the Heidelberg Retina Tomograph. Most of the
variables describe either the area or volume in certain parts of the
papilla and are measured in four sectors (temporal, superior, nasal and
inferior) as well as for the whole papilla (global). The global
measurement is, roughly, the sum of the measurements taken in the four
sector.
The observations in both groups are matched by age and sex to prevent
any bias.
Source Torsten Hothorn and Berthold Lausen (2003), Double-Bagging:
Combining classifiers by bootstrap aggregation. Pattern Recognition,
36(6), 1303–1309.
The Data
GlaucomaM {TH.data}
data("GlaucomaM")
pander::pander(table(GlaucomaM$Class))
GlaucomaM$Class <- 1*(GlaucomaM$Class=="glaucoma")
Standarize the
names for the reporting
studyName <- "GlaucomaM"
dataframe <- GlaucomaM
outcome <- "Class"
thro <- 0.80
TopVariables <- 10
cexheat = 0.25
Generaring the
report
Libraries
Some libraries
library(psych)
library(whitening)
library("vioplot")
library("rpart")
Data specs
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
pander::pander(table(dataframe[,outcome]))
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
largeSet <- length(varlist) > 1500
Scaling the
data
Scaling and removing near zero variance columns and highly
co-linear(r>0.99999) columns
### Some global cleaning
sdiszero <- apply(dataframe,2,sd) > 1.0e-16
dataframe <- dataframe[,sdiszero]
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
dataframe <- dataframe[,tokeep]
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
iscontinous <- sapply(apply(dataframe,2,unique),length) >= 5 ## Only variables with enough samples
dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData
The heatmap of the
data
numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000
if (!largeSet)
{
hm <- heatMaps(data=dataframeScaled[1:numsub,],
Outcome=outcome,
Scale=TRUE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
srtCol=45,
srtRow=45,
cexCol=cexheat,
cexRow=cexheat
)
par(op)
}

Correlation
Matrix of the Data
The heat map of the data
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
#cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Original Correlation",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Pearson Correlation|",
xlab="Feature", ylab="Feature")
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.9961105
The
decorrelation
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#>
#> Included: 61 , Uni p: 0.004032258 , Outcome-Driven Size: 0 , Base Size: 13 , Rcrit: 0.1887431
#>
#>
1 <R=0.996,thr=0.900,N= 40>, Top: 6( 6 )[ 1 : 6 Fa= 6 : 0.900 ]( 6 , 27 , 0 ),<|>Tot Used: 33 , Added: 27 , Zero Std: 0 , Max Cor: 0.910
#>
2 <R=0.910,thr=0.900,N= 40>, Top: 3( 1 )[ 1 : 3 Fa= 9 : 0.900 ]( 3 , 3 , 6 ),<|>Tot Used: 35 , Added: 3 , Zero Std: 0 , Max Cor: 0.980
#>
3 <R=0.980,thr=0.900,N= 40>, Top: 1( 1 )[ 1 : 1 Fa= 10 : 0.900 ]( 1 , 1 , 9 ),<|>Tot Used: 35 , Added: 1 , Zero Std: 0 , Max Cor: 0.941
#>
4 <R=0.941,thr=0.900,N= 40>, Top: 2( 1 )[ 1 : 2 Fa= 10 : 0.900 ]( 1 , 1 , 10 ),<|>Tot Used: 35 , Added: 1 , Zero Std: 0 , Max Cor: 0.896
#>
5 <R=0.896,thr=0.800,N= 29>, Top: 9( 1 )[ 1 : 9 Fa= 14 : 0.800 ]( 9 , 17 , 10 ),<|>Tot Used: 51 , Added: 17 , Zero Std: 0 , Max Cor: 0.906
#>
6 <R=0.906,thr=0.900,N= 2>, Top: 1( 1 )[ 1 : 1 Fa= 14 : 0.900 ]( 1 , 1 , 14 ),<|>Tot Used: 51 , Added: 1 , Zero Std: 0 , Max Cor: 0.861
#>
7 <R=0.861,thr=0.800,N= 6>, Top: 3( 1 )[ 1 : 3 Fa= 16 : 0.800 ]( 3 , 3 , 14 ),<|>Tot Used: 51 , Added: 3 , Zero Std: 0 , Max Cor: 0.849
#>
8 <R=0.849,thr=0.800,N= 6>, Top: 1( 1 )[ 1 : 1 Fa= 16 : 0.800 ]( 1 , 1 , 16 ),<|>Tot Used: 51 , Added: 1 , Zero Std: 0 , Max Cor: 0.799
#>
9 <R=0.799,thr=0.800,N= 6>
#>
[ 9 ], 0.7561781 Decor Dimension: 51 Nused: 51 . Cor to Base: 37 , ABase: 6 , Outcome Base: 0
#>
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]
pander::pander(sum(apply(dataframe[,varlist],2,var)))
3.03
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
1.17
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
3.94
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
3.11
The decorrelation
matrix
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
UPLTM <- attr(DEdataframe,"UPLTM")
gplots::heatmap.2(1.0*(abs(UPLTM)>0),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
}

The correlation
matrix after decorrelation
if (!largeSet)
{
cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Correlation after IDeA",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Pearson Correlation|",
xlab="Feature", ylab="Feature")
par(op)
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.799364
U-MAP Visualization
of features
The UMAP based on
LASSO on Raw Data
if (nrow(dataframe) < 1000)
{
classes <- unique(dataframe[1:numsub,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[1:numsub,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
}

The decorralted
UMAP
if (nrow(dataframe) < 1000)
{
datasetframe.umap = umap(scale(DEdataframe[1:numsub,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
}

Univariate
Analysis
Univariate
univarRAW <- uniRankVar(varlist,
paste(outcome,"~1"),
outcome,
dataframe,
rankingTest="AUC")
univarDe <- uniRankVar(varlistc,
paste(outcome,"~1"),
outcome,
DEdataframe,
rankingTest="AUC",
)
Final Table
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")
##top variables
topvar <- c(1:length(varlist)) <= TopVariables
tableRaw <- univarRAW$orderframe[topvar,univariate_columns]
pander::pander(tableRaw)
| vari |
0.04480 |
0.0368 |
0.1150 |
0.0543 |
0.1035 |
0.880 |
| varg |
0.17851 |
0.1170 |
0.4139 |
0.1967 |
0.0252 |
0.873 |
| vars |
0.04506 |
0.0314 |
0.1068 |
0.0596 |
0.0172 |
0.851 |
| tmi |
0.03664 |
0.1258 |
-0.1097 |
0.1038 |
0.6980 |
0.832 |
| varn |
0.08224 |
0.0595 |
0.1774 |
0.0894 |
0.2533 |
0.830 |
| hic |
0.39863 |
0.1407 |
0.2114 |
0.1574 |
0.7659 |
0.822 |
| tmg |
-0.03356 |
0.0885 |
-0.1524 |
0.0927 |
0.5179 |
0.818 |
| phcg |
-0.03546 |
0.0691 |
-0.1216 |
0.0661 |
0.9359 |
0.818 |
| phci |
0.00898 |
0.0882 |
-0.0937 |
0.0779 |
0.7638 |
0.814 |
| tms |
0.03959 |
0.1166 |
-0.1192 |
0.1376 |
0.9756 |
0.808 |
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]
pander::pander(finalTable)
| varg |
0.17851 |
0.11700 |
0.4139 |
0.1967 |
0.025181 |
0.873 |
| tmg |
-0.03356 |
0.08846 |
-0.1524 |
0.0927 |
0.517931 |
0.818 |
| phcg |
-0.03546 |
0.06909 |
-0.1216 |
0.0661 |
0.935907 |
0.818 |
| phci |
0.00898 |
0.08818 |
-0.0937 |
0.0779 |
0.763829 |
0.814 |
| rnf |
0.13956 |
0.06764 |
0.2252 |
0.0975 |
0.225229 |
0.800 |
| phcn |
0.00692 |
0.07364 |
-0.0717 |
0.0881 |
0.303322 |
0.796 |
| vart |
0.00640 |
0.00533 |
0.0146 |
0.0117 |
0.000754 |
0.777 |
| La_mdic |
0.07786 |
0.05356 |
0.0351 |
0.0380 |
0.136027 |
0.766 |
| mhcg |
0.12197 |
0.05958 |
0.0663 |
0.0660 |
0.925585 |
0.756 |
| hvc |
0.31316 |
0.10125 |
0.4077 |
0.1208 |
0.287631 |
0.738 |
| La_eas |
-0.08615 |
0.05796 |
-0.1572 |
0.1220 |
0.002144 |
0.716 |
| La_eat |
-0.02645 |
0.04201 |
-0.0617 |
0.0580 |
0.123097 |
0.708 |
| La_hic |
-0.09706 |
0.02623 |
-0.0782 |
0.0330 |
0.260690 |
0.701 |
| La_vbrt |
-0.02560 |
0.01535 |
-0.0340 |
0.0198 |
0.375356 |
0.676 |
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
theCharformulas <- attr(dc,"LatentCharFormulas")
finalTable <- rbind(finalTable,tableRaw[topvar[!(topvar %in% topLAvar)],univariate_columns])
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- theCharformulas[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
| vari |
NA |
0.04480 |
0.03683 |
0.1150 |
0.0543 |
0.103543 |
0.880 |
0.880 |
NA |
| varg |
NA |
0.17851 |
0.11700 |
0.4139 |
0.1967 |
0.025181 |
0.873 |
0.873 |
3 |
| varg1 |
NA |
0.17851 |
0.11700 |
0.4139 |
0.1967 |
0.025181 |
0.873 |
NA |
NA |
| vars |
NA |
0.04506 |
0.03143 |
0.1068 |
0.0596 |
0.017158 |
0.851 |
0.851 |
NA |
| tmi |
NA |
0.03664 |
0.12578 |
-0.1097 |
0.1038 |
0.697965 |
0.832 |
0.832 |
NA |
| varn |
NA |
0.08224 |
0.05953 |
0.1774 |
0.0894 |
0.253346 |
0.830 |
0.830 |
NA |
| hic |
NA |
0.39863 |
0.14073 |
0.2114 |
0.1574 |
0.765865 |
0.822 |
0.822 |
NA |
| tmg |
NA |
-0.03356 |
0.08846 |
-0.1524 |
0.0927 |
0.517931 |
0.818 |
0.818 |
2 |
| tmg1 |
NA |
-0.03356 |
0.08846 |
-0.1524 |
0.0927 |
0.517931 |
0.818 |
NA |
NA |
| phcg |
NA |
-0.03546 |
0.06909 |
-0.1216 |
0.0661 |
0.935907 |
0.818 |
0.818 |
NA |
| phcg1 |
NA |
-0.03546 |
0.06909 |
-0.1216 |
0.0661 |
0.935907 |
0.818 |
NA |
NA |
| phci |
NA |
0.00898 |
0.08818 |
-0.0937 |
0.0779 |
0.763829 |
0.814 |
0.814 |
NA |
| phci1 |
NA |
0.00898 |
0.08818 |
-0.0937 |
0.0779 |
0.763829 |
0.814 |
NA |
NA |
| tms |
NA |
0.03959 |
0.11659 |
-0.1192 |
0.1376 |
0.975586 |
0.808 |
0.808 |
NA |
| rnf |
NA |
0.13956 |
0.06764 |
0.2252 |
0.0975 |
0.225229 |
0.800 |
0.800 |
NA |
| phcn |
NA |
0.00692 |
0.07364 |
-0.0717 |
0.0881 |
0.303322 |
0.796 |
0.796 |
NA |
| vart |
NA |
0.00640 |
0.00533 |
0.0146 |
0.0117 |
0.000754 |
0.777 |
0.777 |
NA |
| La_mdic |
- (0.276)vbsg + mdic |
0.07786 |
0.05356 |
0.0351 |
0.0380 |
0.136027 |
0.766 |
0.785 |
1 |
| mhcg |
NA |
0.12197 |
0.05958 |
0.0663 |
0.0660 |
0.925585 |
0.756 |
0.756 |
4 |
| hvc |
NA |
0.31316 |
0.10125 |
0.4077 |
0.1208 |
0.287631 |
0.738 |
0.738 |
NA |
| La_eas |
- (0.233)ag + eas |
-0.08615 |
0.05796 |
-0.1572 |
0.1220 |
0.002144 |
0.716 |
0.621 |
-1 |
| La_eat |
- (0.173)ag + eat |
-0.02645 |
0.04201 |
-0.0617 |
0.0580 |
0.123097 |
0.708 |
0.575 |
-1 |
| La_hic |
+ hic - (1.240)mhcg - (0.077)vbsg - (0.981)mdic |
-0.09706 |
0.02623 |
-0.0782 |
0.0330 |
0.260690 |
0.701 |
0.822 |
-3 |
| La_vbrt |
- (0.019)vbsg - (0.856)vbst + vbrt |
-0.02560 |
0.01535 |
-0.0340 |
0.0198 |
0.375356 |
0.676 |
0.709 |
-1 |
Comparing IDeA vs
PCA vs EFA
PCA
featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,scale. = TRUE) #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous])
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)
#pander::pander(pc$rotation)
PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])
gplots::heatmap.2(abs(PCACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "PCA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")

EFA
EFAdataframe <- dataframeScaled
if (length(iscontinous) < 2000)
{
topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
if (topred < 2) topred <- 2
uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE) # EFA analysis
predEFA <- predict(uls,dataframeScaled[,iscontinous])
EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous])
EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
gplots::heatmap.2(abs(EFACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "EFA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
}

Effect on CAR
modeling
par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(rawmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
}

pander::pander(table(dataframe[,outcome],pr))
pander::pander(ptab$detail[c(5,3,4,6),])
| 5 |
diag.ac |
0.867 |
0.812 |
0.911 |
| 3 |
se |
0.888 |
0.808 |
0.943 |
| 4 |
sp |
0.847 |
0.760 |
0.912 |
| 6 |
diag.or |
43.764 |
19.005 |
100.778 |
par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe,control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(IDeAmodel,main="IDeA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(IDeAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
}

pander::pander(table(DEdataframe[,outcome],pr))
pander::pander(ptab$detail[c(5,3,4,6),])
| 5 |
diag.ac |
0.878 |
0.823 |
0.920 |
| 3 |
se |
0.888 |
0.808 |
0.943 |
| 4 |
sp |
0.867 |
0.784 |
0.927 |
| 6 |
diag.or |
51.713 |
21.954 |
121.814 |
par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(PCAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
pander::pander(ptab$detail[c(5,3,4,6),])
| 5 |
diag.ac |
0.832 |
0.772 |
0.881 |
| 3 |
se |
0.796 |
0.703 |
0.871 |
| 4 |
sp |
0.867 |
0.784 |
0.927 |
| 6 |
diag.or |
25.500 |
11.891 |
54.684 |
par(op)
EFA
EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(EFAmodel,EFAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(EFAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
}

pander::pander(table(EFAdataframe[,outcome],pr))
pander::pander(ptab$detail[c(5,3,4,6),])
| 5 |
diag.ac |
0.821 |
0.761 |
0.872 |
| 3 |
se |
0.765 |
0.669 |
0.845 |
| 4 |
sp |
0.878 |
0.796 |
0.935 |
| 6 |
diag.or |
23.370 |
10.890 |
50.149 |
par(op)